library(tidyverse)
library(janitor)
library(dbplyr)
library(dplyr)
sales_2016 <- read_csv("sales-2016-with-ratings.csv")
Parsed with column specification:
cols(
Name = [31mcol_character()[39m,
Platform = [31mcol_character()[39m,
Year_of_Release = [31mcol_character()[39m,
Genre = [31mcol_character()[39m,
Publisher = [31mcol_character()[39m,
NA_Sales = [32mcol_double()[39m,
EU_Sales = [32mcol_double()[39m,
JP_Sales = [32mcol_double()[39m,
Other_Sales = [32mcol_double()[39m,
Global_Sales = [32mcol_double()[39m,
Critic_Score = [32mcol_double()[39m,
Critic_Count = [32mcol_double()[39m,
User_Score = [31mcol_character()[39m,
User_Count = [32mcol_double()[39m,
Developer = [31mcol_character()[39m,
Rating = [31mcol_character()[39m
)
glimpse(sales_2016)
Rows: 16,719
Columns: 16
$ Name [3m[38;5;246m<chr>[39m[23m "Wii Sports", "Super Mario Bros.", "Mario Kart Wii", "Wii Sports Resort", "Pokemon Red/Pokemon Blue", "Tetris", "New Super Mario Bros.", "Wii Play", "New Super Mario Bros. Wii", "Du…
$ Platform [3m[38;5;246m<chr>[39m[23m "Wii", "NES", "Wii", "Wii", "GB", "GB", "DS", "Wii", "Wii", "NES", "DS", "DS", "GB", "Wii", "X360", "Wii", "PS3", "PS2", "SNES", "DS", "DS", "GB", "NES", "X360", "PS2", "GBA", "DS",…
$ Year_of_Release [3m[38;5;246m<chr>[39m[23m "2006", "1985", "2008", "2009", "1996", "1989", "2006", "2006", "2009", "1984", "2005", "2005", "1999", "2007", "2010", "2009", "2013", "2004", "1990", "2005", "2006", "1989", "1988…
$ Genre [3m[38;5;246m<chr>[39m[23m "Sports", "Platform", "Racing", "Sports", "Role-Playing", "Puzzle", "Platform", "Misc", "Platform", "Shooter", "Simulation", "Racing", "Role-Playing", "Sports", "Misc", "Sports", "A…
$ Publisher [3m[38;5;246m<chr>[39m[23m "Nintendo", "Nintendo", "Nintendo", "Nintendo", "Nintendo", "Nintendo", "Nintendo", "Nintendo", "Nintendo", "Nintendo", "Nintendo", "Nintendo", "Nintendo", "Nintendo", "Microsoft Ga…
$ NA_Sales [3m[38;5;246m<dbl>[39m[23m 41.36, 29.08, 15.68, 15.61, 11.27, 23.20, 11.28, 13.96, 14.44, 26.93, 9.05, 9.71, 9.00, 8.92, 15.00, 9.01, 7.02, 9.43, 12.78, 4.74, 6.38, 10.83, 9.54, 9.66, 8.41, 6.06, 3.43, 5.51, …
$ EU_Sales [3m[38;5;246m<dbl>[39m[23m 28.96, 3.58, 12.76, 10.93, 8.89, 2.26, 9.14, 9.18, 6.94, 0.63, 10.95, 7.47, 6.18, 8.03, 4.89, 8.49, 9.09, 0.40, 3.75, 9.20, 4.46, 2.71, 3.44, 5.14, 5.49, 3.90, 5.35, 3.17, 5.09, 4.2…
$ JP_Sales [3m[38;5;246m<dbl>[39m[23m 3.77, 6.81, 3.79, 3.28, 10.22, 4.22, 6.50, 2.93, 4.70, 0.28, 1.93, 4.13, 7.20, 3.60, 0.24, 2.53, 0.98, 0.41, 3.54, 4.16, 6.04, 4.18, 3.84, 0.06, 0.47, 5.38, 5.32, 5.65, 1.87, 0.13, …
$ Other_Sales [3m[38;5;246m<dbl>[39m[23m 8.45, 0.77, 3.29, 2.95, 1.00, 0.58, 2.88, 2.84, 2.24, 0.47, 2.74, 1.90, 0.71, 2.15, 1.69, 1.77, 3.96, 10.57, 0.55, 2.04, 1.36, 0.42, 0.46, 1.41, 1.78, 0.50, 1.18, 0.80, 1.16, 1.32, …
$ Global_Sales [3m[38;5;246m<dbl>[39m[23m 82.53, 40.24, 35.52, 32.77, 31.37, 30.26, 29.80, 28.92, 28.32, 28.31, 24.67, 23.21, 23.10, 22.70, 21.81, 21.79, 21.04, 20.81, 20.61, 20.15, 18.25, 18.14, 17.28, 16.27, 16.15, 15.85,…
$ Critic_Score [3m[38;5;246m<dbl>[39m[23m 76, NA, 82, 80, NA, NA, 89, 58, 87, NA, NA, 91, NA, 80, 61, 80, 97, 95, NA, 77, NA, NA, NA, 97, 95, NA, 77, NA, 95, 88, NA, NA, 87, NA, 83, 83, 94, 88, 97, 93, 85, 88, 97, 86, 94, N…
$ Critic_Count [3m[38;5;246m<dbl>[39m[23m 51, NA, 73, 73, NA, NA, 65, 41, 80, NA, NA, 64, NA, 63, 45, 33, 50, 80, NA, 58, NA, NA, NA, 58, 62, NA, 37, NA, 54, 81, NA, NA, 89, NA, 21, 73, 100, 39, 56, 81, 73, 58, 66, 57, 86, …
$ User_Score [3m[38;5;246m<chr>[39m[23m "8", NA, "8.3", "8", NA, NA, "8.5", "6.6", "8.4", NA, NA, "8.6", NA, "7.7", "6.3", "7.4", "8.2", "9", NA, "7.9", NA, NA, NA, "8.1", "8.7", NA, "7.1", NA, "8.4", "3.4", NA, NA, "6.3"…
$ User_Count [3m[38;5;246m<dbl>[39m[23m 322, NA, 709, 192, NA, NA, 431, 129, 594, NA, NA, 464, NA, 146, 106, 52, 3994, 1588, NA, 50, NA, NA, NA, 3711, 730, NA, 19, NA, 314, 8713, NA, NA, 1454, NA, 922, 2256, 2698, 5234, 6…
$ Developer [3m[38;5;246m<chr>[39m[23m "Nintendo", NA, "Nintendo", "Nintendo", NA, NA, "Nintendo", "Nintendo", "Nintendo", NA, NA, "Nintendo", NA, "Nintendo", "Good Science Studio", "Nintendo", "Rockstar North", "Rocksta…
$ Rating [3m[38;5;246m<chr>[39m[23m "E", NA, "E", "E", NA, NA, "E", "E", "E", NA, NA, "E", NA, "E", "E", "E", "M", "M", NA, "E", NA, NA, NA, "M", "M", NA, "E", NA, "E", "M", NA, NA, "M", NA, "M", "M", "M", "M", "M", "…
head(sales_2016)
Look for missing variables and n/a. Will need to drop_na()
summary(sales_2016)
Name Platform Year_of_Release Genre Publisher NA_Sales EU_Sales JP_Sales Other_Sales Global_Sales Critic_Score
Length:16719 Length:16719 Length:16719 Length:16719 Length:16719 Min. : 0.0000 Min. : 0.000 Min. : 0.0000 Min. : 0.00000 Min. : 0.0100 Min. :13.00
Class :character Class :character Class :character Class :character Class :character 1st Qu.: 0.0000 1st Qu.: 0.000 1st Qu.: 0.0000 1st Qu.: 0.00000 1st Qu.: 0.0600 1st Qu.:60.00
Mode :character Mode :character Mode :character Mode :character Mode :character Median : 0.0800 Median : 0.020 Median : 0.0000 Median : 0.01000 Median : 0.1700 Median :71.00
Mean : 0.2633 Mean : 0.145 Mean : 0.0776 Mean : 0.04733 Mean : 0.5335 Mean :68.97
3rd Qu.: 0.2400 3rd Qu.: 0.110 3rd Qu.: 0.0400 3rd Qu.: 0.03000 3rd Qu.: 0.4700 3rd Qu.:79.00
Max. :41.3600 Max. :28.960 Max. :10.2200 Max. :10.57000 Max. :82.5300 Max. :98.00
NA's :8582
Critic_Count User_Score User_Count Developer Rating
Min. : 3.00 Length:16719 Min. : 4.0 Length:16719 Length:16719
1st Qu.: 12.00 Class :character 1st Qu.: 10.0 Class :character Class :character
Median : 21.00 Mode :character Median : 24.0 Mode :character Mode :character
Mean : 26.36 Mean : 162.2
3rd Qu.: 36.00 3rd Qu.: 81.0
Max. :113.00 Max. :10665.0
NA's :8582 NA's :9129
names(sales_2016)
[1] "Name" "Platform" "Year_of_Release" "Genre" "Publisher" "NA_Sales" "EU_Sales" "JP_Sales" "Other_Sales" "Global_Sales" "Critic_Score"
[12] "Critic_Count" "User_Score" "User_Count" "Developer" "Rating"
sales_2016_clean <- sales_2016 %>%
clean_names()
names(sales_2016_clean)
[1] "name" "platform" "year_of_release" "genre" "publisher" "na_sales" "eu_sales" "jp_sales" "other_sales" "global_sales" "critic_score"
[12] "critic_count" "user_score" "user_count" "developer" "rating"
From the information I note:
na_sales = North American sales eu_sales = European sales jp_sales = Japan sales other_sales = rest of world sales global_sales = sum of all sales
Also year_of_release is character class not an intiger, this will have to be changed.
sales_2019 <- read_csv("sales-2019.csv")
Parsed with column specification:
cols(
.default = col_double(),
Name = [31mcol_character()[39m,
basename = [31mcol_character()[39m,
Genre = [31mcol_character()[39m,
ESRB_Rating = [31mcol_character()[39m,
Platform = [31mcol_character()[39m,
Publisher = [31mcol_character()[39m,
Developer = [31mcol_character()[39m,
VGChartz_Score = [33mcol_logical()[39m,
Last_Update = [31mcol_character()[39m,
url = [31mcol_character()[39m,
img_url = [31mcol_character()[39m
)
See spec(...) for full column specifications.
glimpse(sales_2019)
Rows: 55,792
Columns: 23
$ Rank [3m[38;5;246m<dbl>[39m[23m 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,…
$ Name [3m[38;5;246m<chr>[39m[23m "Wii Sports", "Super Mario Bros.", "Mario Kart Wii", "PlayerUnknown's Battlegrounds", "Wii Sports Resort", "Pokemon Red / Green / Blue Version", "New Super Mario Bros.", "Tetris", "N…
$ basename [3m[38;5;246m<chr>[39m[23m "wii-sports", "super-mario-bros", "mario-kart-wii", "playerunknowns-battlegrounds", "wii-sports-resort", "pokmon-red", "new-super-mario-bros", "tetris", "new-super-mario-bros-wii", "…
$ Genre [3m[38;5;246m<chr>[39m[23m "Sports", "Platform", "Racing", "Shooter", "Sports", "Role-Playing", "Platform", "Puzzle", "Platform", "Misc", "Shooter", "Misc", "Party", "Simulation", "Racing", "Role-Playing", "Sp…
$ ESRB_Rating [3m[38;5;246m<chr>[39m[23m "E", NA, "E", NA, "E", "E", "E", "E", "E", NA, NA, "E", "E", "E", "E", "E", "E", "E", "E", "M", "M", "E", NA, "E", "E", "E", "M", "E", "E", "E", "M", "E", "M", "E", "M", "E", "M", NA…
$ Platform [3m[38;5;246m<chr>[39m[23m "Wii", "NES", "Wii", "PC", "Wii", "GB", "DS", "GB", "Wii", "PC", "NES", "Wii", "X360", "DS", "DS", "GB", "Wii", "Wii", "SNES", "PS3", "PS4", "DS", "PC", "GB", "3DS", "DS", "PS2", "NE…
$ Publisher [3m[38;5;246m<chr>[39m[23m "Nintendo", "Nintendo", "Nintendo", "PUBG Corporation", "Nintendo", "Nintendo", "Nintendo", "Nintendo", "Nintendo", "Mojang", "Nintendo", "Nintendo", "Microsoft Game Studios", "Ninte…
$ Developer [3m[38;5;246m<chr>[39m[23m "Nintendo EAD", "Nintendo EAD", "Nintendo EAD", "PUBG Corporation", "Nintendo EAD", "Game Freak", "Nintendo EAD", "Bullet Proof Software", "Nintendo EAD", "Mojang AB", "Nintendo R&D1…
$ VGChartz_Score [3m[38;5;246m<lgl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Critic_Score [3m[38;5;246m<dbl>[39m[23m 7.7, 10.0, 8.2, NA, 8.0, 9.4, 9.1, NA, 8.6, 10.0, NA, 5.9, 6.7, 8.4, 9.1, 9.2, 7.9, 8.0, 8.5, 9.4, 9.7, 8.1, NA, NA, 8.2, 8.6, 9.5, NA, 8.9, 8.8, 9.6, 9.0, NA, 8.6, NA, 9.3, 9.3, NA,…
$ User_Score [3m[38;5;246m<dbl>[39m[23m NA, NA, 9.1, NA, 8.8, NA, 8.1, NA, 9.2, NA, NA, 4.5, NA, NA, 9.4, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 9.7, NA, NA, NA, NA, 9.0, NA, NA, NA, NA, NA, NA, NA, NA, NA, 9.…
$ Total_Shipped [3m[38;5;246m<dbl>[39m[23m 82.86, 40.24, 37.14, 36.60, 33.09, 31.38, 30.80, 30.26, 30.22, 30.01, 28.31, 28.02, 24.00, 23.96, 23.60, 23.10, 22.67, 21.13, 20.61, NA, NA, 19.01, 18.58, 18.14, 18.11, 17.67, 17.30,…
$ Global_Sales [3m[38;5;246m<dbl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 20.32, 19.39, NA, NA, NA, NA, NA, NA, NA, NA, NA, 16.15, NA, 15.86, NA, 15.09, NA, NA, NA, NA, NA, 14.82, …
$ NA_Sales [3m[38;5;246m<dbl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 6.37, 6.06, NA, NA, NA, NA, NA, NA, NA, NA, NA, 8.41, NA, 9.06, NA, 6.18, NA, NA, NA, NA, NA, 9.07, 9.76, …
$ PAL_Sales [3m[38;5;246m<dbl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 9.85, 9.71, NA, NA, NA, NA, NA, NA, NA, NA, NA, 5.49, NA, 5.33, NA, 6.05, NA, NA, NA, NA, NA, 4.29, 3.73, …
$ JP_Sales [3m[38;5;246m<dbl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.99, 0.60, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.47, NA, 0.06, NA, 0.41, NA, NA, NA, NA, NA, 0.13, 0.11, …
$ Other_Sales [3m[38;5;246m<dbl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 3.12, 3.02, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1.78, NA, 1.42, NA, 2.44, NA, NA, NA, NA, NA, 1.33, 1.14, …
$ Year [3m[38;5;246m<dbl>[39m[23m 2006, 1985, 2008, 2017, 2009, 1998, 2006, 1989, 2009, 2010, 1985, 2007, 2010, 2005, 2005, 2000, 2008, 2009, 1991, 2013, 2014, 2006, 2004, 1989, 2011, 2007, 2004, 1990, 2013, 2003, 20…
$ Last_Update [3m[38;5;246m<chr>[39m[23m NA, NA, "11th Apr 18", "13th Nov 18", NA, NA, NA, NA, NA, "05th Aug 18", NA, NA, "29th Sep 18", NA, NA, NA, NA, NA, "10th Aug 18", NA, "03rd Jan 18", NA, NA, NA, NA, NA, NA, "09th Au…
$ url [3m[38;5;246m<chr>[39m[23m "http://www.vgchartz.com/game/2667/wii-sports/?region=All", "http://www.vgchartz.com/game/6455/super-mario-bros/?region=All", "http://www.vgchartz.com/game/6968/mario-kart-wii/?regio…
$ status [3m[38;5;246m<dbl>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ Vgchartzscore [3m[38;5;246m<dbl>[39m[23m NA, NA, 8.7, NA, 8.8, NA, NA, NA, 9.1, NA, NA, 5.3, 6.2, NA, NA, NA, 8.0, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 8.8,…
$ img_url [3m[38;5;246m<chr>[39m[23m "/games/boxart/full_2258645AmericaFrontccc.jpg", "/games/boxart/8972270ccc.jpg", "/games/boxart/full_8932480AmericaFrontccc.jpg", "/games/boxart/full_8052843AmericaFrontccc.jpg", "/g…
head(sales_2019)
names(sales_2019)
[1] "Rank" "Name" "basename" "Genre" "ESRB_Rating" "Platform" "Publisher" "Developer" "VGChartz_Score" "Critic_Score" "User_Score" "Total_Shipped"
[13] "Global_Sales" "NA_Sales" "PAL_Sales" "JP_Sales" "Other_Sales" "Year" "Last_Update" "url" "status" "Vgchartzscore" "img_url"
Look for missing variables and n/a. Will need to drop_na()
summary(sales_2019)
Rank Name basename Genre ESRB_Rating Platform Publisher Developer VGChartz_Score Critic_Score User_Score
Min. : 1 Length:55792 Length:55792 Length:55792 Length:55792 Length:55792 Length:55792 Length:55792 Mode:logical Min. : 1.00 Min. : 2.00
1st Qu.:13949 Class :character Class :character Class :character Class :character Class :character Class :character Class :character NA's:55792 1st Qu.: 6.40 1st Qu.: 7.80
Median :27896 Mode :character Mode :character Mode :character Mode :character Mode :character Mode :character Mode :character Median : 7.50 Median : 8.50
Mean :27896 Mean : 7.21 Mean : 8.25
3rd Qu.:41844 3rd Qu.: 8.30 3rd Qu.: 9.10
Max. :55792 Max. :10.00 Max. :10.00
NA's :49256 NA's :55457
Total_Shipped Global_Sales NA_Sales PAL_Sales JP_Sales Other_Sales Year Last_Update url status Vgchartzscore img_url
Min. : 0.03 Min. : 0.00 Min. :0.00 Min. :0.00 Min. :0.00 Min. :0.00 Min. :1970 Length:55792 Length:55792 Min. :1 Min. :2.60 Length:55792
1st Qu.: 0.20 1st Qu.: 0.03 1st Qu.:0.05 1st Qu.:0.01 1st Qu.:0.02 1st Qu.:0.00 1st Qu.:2000 Class :character Class :character 1st Qu.:1 1st Qu.:6.80 Class :character
Median : 0.59 Median : 0.12 Median :0.12 Median :0.04 Median :0.05 Median :0.01 Median :2008 Mode :character Mode :character Median :1 Median :7.80 Mode :character
Mean : 1.89 Mean : 0.37 Mean :0.28 Mean :0.16 Mean :0.11 Mean :0.04 Mean :2006 Mean :1 Mean :7.43
3rd Qu.: 1.80 3rd Qu.: 0.36 3rd Qu.:0.29 3rd Qu.:0.14 3rd Qu.:0.12 3rd Qu.:0.04 3rd Qu.:2011 3rd Qu.:1 3rd Qu.:8.50
Max. :82.86 Max. :20.32 Max. :9.76 Max. :9.85 Max. :2.69 Max. :3.12 Max. :2020 Max. :1 Max. :9.60
NA's :53965 NA's :36377 NA's :42828 NA's :42603 NA's :48749 NA's :40270 NA's :979 NA's :54993
sales_2019_clean <- sales_2019 %>%
clean_names()
names(sales_2019_clean)
[1] "rank" "name" "basename" "genre" "esrb_rating" "platform" "publisher" "developer" "vg_chartz_score" "critic_score" "user_score"
[12] "total_shipped" "global_sales" "na_sales" "pal_sales" "jp_sales" "other_sales" "year" "last_update" "url" "status" "vgchartzscore"
[23] "img_url"
##sales_2019_clean %>%
##ggplot(aes(x = name, y = total_shipped)) +
##geom_point()
##sales_2016_clean %>%
##ggplot(aes(x = name, y = global_sales)) +
##geom_point()
There are a few outliers (higher sales) in the data
sales_2016_clean %>%
arrange(desc(global_sales))
sales_2016_clean
boxplot(sales_2016_clean$global_sales)
The outlier is Wii Sports released in 2006
view(sales_2016_clean)
view(sales_2019_clean)
sales_2019_clean %>%
arrange(desc(total_shipped))
sales_2019_clean
sales_2019_clean %>%
arrange(desc(total_shipped))
sales_2019_clean
boxplot(sales_2019_clean$total_shipped)
sales_2016_clean %>%
select(name, critic_score, global_sales) %>%
arrange(desc(critic_score))
sales_2019_clean %>%
select(name, critic_score, total_shipped) %>%
arrange(desc(critic_score))
sales_2016_clean %>%
distinct(rating)
sales_2019_clean %>%
distinct(esrb_rating)
esrb_rating and rating refer to the esrb_rating categories are the Rating Categories suggest age appropriateness. E = Everyone KA = known as Kids to Adults (KA) until 1998, when renamed “Everyone” EC = Early childhood (years 3+) E10 = Everyoen over age of 10 T = Teen M = Mature (years 17+) AO = Adults only (years 18+)
RP = Rating pending
Looking at the vgchartzscore and vg_chartz_score columns.
sales_2019_clean %>%
distinct(vgchartzscore)
vg_chartz_score has nothing in any cells
sales_2019_clean %>%
distinct(vg_chartz_score)
sales_2019_clean %>%
filter(total_shipped > 0) %>%
count(global_sales)
sales_2019_clean %>%
filter(total_shipped > 0) %>%
count(na_sales)
sales_2019_clean %>%
filter(total_shipped > 0) %>%
count(pal_sales)
sales_2019_clean %>%
filter(total_shipped > 0) %>%
count(jp_sales)
sales_2019_clean %>%
filter(total_shipped > 0) %>%
count(other_sales)
sales_2019_clean %>%
filter(global_sales > 0) %>%
select(global_sales, na_sales, pal_sales, jp_sales, other_sales) %>%
summary
global_sales na_sales pal_sales jp_sales other_sales
Min. : 0.0100 Min. :0.00 Min. :0.000 Min. :0.000 Min. :0.000
1st Qu.: 0.0500 1st Qu.:0.05 1st Qu.:0.020 1st Qu.:0.020 1st Qu.:0.000
Median : 0.1400 Median :0.12 Median :0.050 Median :0.050 Median :0.010
Mean : 0.3935 Mean :0.28 Mean :0.169 Mean :0.114 Mean :0.048
3rd Qu.: 0.4000 3rd Qu.:0.29 3rd Qu.:0.160 3rd Qu.:0.130 3rd Qu.:0.040
Max. :20.3200 Max. :9.76 Max. :9.850 Max. :2.690 Max. :3.120
NA's :5258 NA's :5910 NA's :11244 NA's :3525
global_sales_total <- sales_2019_clean %>%
filter(global_sales > 0) %>%
select(global_sales) %>%
sum()
global_sales_total
[1] 7096.24
na_sales_total <- sales_2019_clean %>%
filter(na_sales > 0) %>%
select(na_sales) %>%
sum()
na_sales_total
[1] 3572.12
pal_sales_total <- sales_2019_clean %>%
filter(pal_sales > 0) %>%
select(pal_sales) %>%
sum()
pal_sales_total
[1] 2047.76
jp_sales_total <- sales_2019_clean %>%
filter(jp_sales > 0) %>%
select(jp_sales) %>%
sum()
jp_sales_total
[1] 777.56
other_sales_total <- sales_2019_clean %>%
filter(other_sales > 0) %>%
select(other_sales) %>%
sum()
other_sales_total
[1] 694.13
So what is the difference between global sales and all of the ohter regions
global_sales_total - (jp_sales_total + na_sales_total + other_sales_total + pal_sales_total)
[1] 4.67
sales_2019_clean %>%
filter(last_update > 0) %>%
distinct(name, last_update) %>%
arrange(desc(last_update))
As the only variable i’ve not yet looked at
sales_2019_clean %>%
distinct(status)
sales_2019_clean %>%
filter(vgchartzscore > 0) %>%
select(name, critic_score, user_score, vgchartzscore)
Which games have 0 sales across all regions (filter by global_sales)
sales_2019_clean %>%
filter(global_sales == 0) %>%
select(name, na_sales, pal_sales, jp_sales, other_sales)
sales_2019_clean
sales_2019_clean %>%
select(name, platform, global_sales, na_sales, pal_sales, jp_sales, other_sales)
NA
Is there more than 1 entry for each game? ie games have entries for each year There are 37,102 distint rows compared to 55,792 rows in the data set, a difference of 18,690
sales_2019_clean %>%
distinct(name) %>%
count()
NA
sales_2019_clean %>%
count(name)
NA
There are 61 names with more than 1 entry
sales_2019_clean %>%
count(name > 1) %>%
arrange()
NA
sales_2019_clean %>%
group_by(name, year) %>%
filter(n() >1) %>%
count()
NA
So there are multiple entries under the same name for each platform eg 2010 FIFA World Cup South Africa
sales_2019_clean %>%
filter(name == "2010 FIFA World Cup South Africa")
NA
sales_2019_clean %>%
filter(name == "007: Quantum of Solace")
NA
sales_2019_clean %>%
filter(name == "007: Quantum of Solace") %>%
ggplot(aes(x = platform, y = total_shipped)) +
geom_point()
There looks to be a difference between sales for example for 007: Quantum of Solace across platforms, so lets have a look at all platforms
sales_2019_clean %>%
filter(total_shipped != 0) %>%
ggplot(aes(x = platform, y = total_shipped)) +
geom_point()
sales_2019_clean %>%
filter(total_shipped >10) %>%
ggplot(aes(x = platform, y = total_shipped)) +
geom_point()
Lets count by platform
sales_2019_clean %>%
group_by(platform) %>%
filter(n() > 1) %>%
arrange(n()) %>%
count()
NA
sales_2019_clean %>%
group_by(platform) %>%
arrange(n()) %>%
count()
NA
sales_2019_clean %>%
group_by(platform) %>%
count()
NA
sales_2019_clean_platform <- sales_2019_clean %>%
group_by(platform) %>%
mutate("platform_sales" = n())
sales_2019_clean_platform
NA
sales_2019_clean_platform %>%
count() %>%
arrange(desc(n))
Now lets have a look at Genre in the dataset
sales_2019_clean %>%
filter(total_shipped >10) %>%
ggplot(aes(x = genre, y = total_shipped)) +
geom_point()
sales_2019_clean %>%
group_by(genre) %>%
count()
NA
Also whatis the best publisher?
sales_2019_clean %>%
filter(total_shipped > 5) %>%
ggplot(aes(x = total_shipped, y = publisher)) +
geom_point()
Lets look at critic_score
sales_2019_clean %>%
filter(critic_score > 9.5) %>%
ggplot(aes(x = critic_score, y = name)) +
geom_point()
Summary of findings and comparision between the 2 datasets 1. sales_2016 and 2. sales_2019
2 datasets 2016 and 2019
2016
“Critic_Count” “User_Count”
2019 v 2016 “Rank” = n/a [1] “Name” = “Name” “basename” = n/a (games names ) “Genre” = “Genre” “ESRB_Rating” = “Rating” As above this rating references if games are age appropriate “Platform” = “Platform” “Publisher” = “Publisher” “Developer” = “Developer” “vg_chartz_score” = n/a (also contains no variables) [9] “VGChartz_Score”= n/a “Critic_Score” = “Critic_Score” (2016 and 2019 are scored differently. From 100 in 2016 and from 10 in 2019) “User_Score” = [13] “User_Score” “Total_Shipped” = n/a “Global_Sales” = “Global_Sales” “NA_Sales” = “NA_Sales”
“PAL_Sales” != [7] “EU_Sales” (PAL usually refers to Asia, Africa, Europe, South America and Oceania) SO these dont natch “JP_Sales” = “JP_Sales” [17] “Other_Sales” = “Other_Sales” “Year” = “Year_of_Release” “Last_Update” = n/a “url” = n/a “status” = n/a. No variables in “Vgchartzscore” = n/a “img_url” = n/a
Convert years to Intiger
###sales_2016_clean <- sales_2016_clean %>%
#mutate(
#year_of_release = as.integer(year_of_release))
#sales_2016_clean
###sales_2019_clean <- sales_2019_clean %>%
#mutate(
# year = as.integer(year))
#sales_2019_clean
#### sales_2019_clean <- sales_2019_clean %>%
#mutate(
# year = as.integer(year))
#sales_2019_clean
sales_clean_2019_select_var <- sales_2019_clean %>%
select(-basename,
-vg_chartz_score,
-url,
-status,
-img_url)
sales_clean_2019_select_var
I still need to look at and possibly remove the NA’s
Lets amend the dates in for sales to 2019
sales_clean_2019_year <- sales_clean_2019_select_var %>%
rename(
"total_shipped_2019" = "total_shipped",
"global_sales_2019" = "global_sales",
"na_sales_2019" = "na_sales",
"eu_sales_2019" = "pal_sales",
"jp_sales_2019" = "jp_sales",
"other_sales_2019" = "other_sales")
sales_clean_2019_year
view(sales_clean_2019_year)
Now lets do the same for 2016
sales_2016_clean_select <- sales_2016_clean %>%
select(-year_of_release,
-critic_count,
-user_count,
-user_score,
-rating)
sales_2016_clean_select
sales_clean_2016_year <- sales_2016_clean_select %>%
rename(
"na_sales_2016" = "na_sales",
"eu_sales_2016" = "eu_sales",
"jp_sales_2016" = "jp_sales",
"other_sales_2016" = "other_sales",
"global_sales_2016" = "global_sales")
sales_clean_2016_year
sales_clean_2019_select_var_bind <- sales_clean_2019_select_var %>%
rename(
"eu_sales" = "pal_sales"
)
sales_clean_2019_select_var_bind
sales_2016_clean_select_bind <- sales_2016_clean_select %>%
mutate(year_data = 2016)
sales_2019_clean_select_bind <- sales_clean_2019_select_var_bind %>%
mutate(year_data = 2019)
games_sales_full_bind <- bind_rows(sales_2016_clean_select_bind, sales_2019_clean_select_bind)
games_sales_full_bind
unique(games_sales_full_bind$year_data)
[1] 2016 2019
games_sales_full_bind_select <- games_sales_full_bind %>%
select(
-user_score)
games_sales_full_bind_select
names(sales_2016_clean)
[1] "name" "platform" "year_of_release" "genre" "publisher" "na_sales" "eu_sales" "jp_sales" "other_sales" "global_sales" "critic_score"
[12] "critic_count" "user_score" "user_count" "developer" "rating"
names(sales_2019_clean)
[1] "rank" "name" "basename" "genre" "esrb_rating" "platform" "publisher" "developer" "vg_chartz_score" "critic_score" "user_score"
[12] "total_shipped" "global_sales" "na_sales" "pal_sales" "jp_sales" "other_sales" "year" "last_update" "url" "status" "vgchartzscore"
[23] "img_url"
games_sales_full_bind_total <- mutate(games_sales_full_bind, total_global_sales = total_shipped + global_sales)
games_sales_full_bind_total
games_sales_full_bind_total %>%
arrange(total_global_sales)
games_sales_full_bind_total
summary(games_sales_full_bind_total)
name platform genre publisher na_sales eu_sales jp_sales other_sales global_sales critic_score developer year_data
Length:72511 Length:72511 Length:72511 Length:72511 Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 1.00 Length:72511 Min. :2016
Class :character Class :character Class :character Class :character 1st Qu.: 0.03 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.05 1st Qu.: 7.70 Class :character 1st Qu.:2019
Mode :character Mode :character Mode :character Mode :character Median : 0.10 Median : 0.03 Median : 0.01 Median : 0.01 Median : 0.14 Median :49.00 Mode :character Median :2019
Mean : 0.27 Mean : 0.15 Mean : 0.09 Mean : 0.05 Mean : 0.44 Mean :41.46 Mean :2018
3rd Qu.: 0.26 3rd Qu.: 0.12 3rd Qu.: 0.07 3rd Qu.: 0.03 3rd Qu.: 0.41 3rd Qu.:72.00 3rd Qu.:2019
Max. :41.36 Max. :28.96 Max. :10.22 Max. :10.57 Max. :82.53 Max. :98.00 Max. :2019
NA's :42828 NA's :42603 NA's :48749 NA's :40270 NA's :36377 NA's :57838
rank esrb_rating user_score total_shipped year last_update vgchartzscore total_global_sales
Min. : 1 Length:72511 Min. : 2.00 Min. : 0.03 Min. :1970 Length:72511 Min. :2.60 Min. : NA
1st Qu.:13949 Class :character 1st Qu.: 7.80 1st Qu.: 0.20 1st Qu.:2000 Class :character 1st Qu.:6.80 1st Qu.: NA
Median :27896 Mode :character Median : 8.50 Median : 0.59 Median :2008 Mode :character Median :7.80 Median : NA
Mean :27896 Mean : 8.25 Mean : 1.89 Mean :2006 Mean :7.43 Mean :NaN
3rd Qu.:41844 3rd Qu.: 9.10 3rd Qu.: 1.80 3rd Qu.:2011 3rd Qu.:8.50 3rd Qu.: NA
Max. :55792 Max. :10.00 Max. :82.86 Max. :2020 Max. :9.60 Max. : NA
NA's :16719 NA's :72176 NA's :70684 NA's :17698 NA's :71712 NA's :72511
write.csv(games_sales_full_bind_total, "games_sales_full_bind_total.csv")
games_sales_full_bind_total_longer <- games_sales_full_bind_total %>%
pivot_longer(cols = c("na_sales", "eu_sales", "jp_sales", "other_sales"),
names_to = "sales_region",
values_to = "sales_region_millions")
games_sales_full_bind_total_longer
games_sales_full_bind_total_longer %>%
filter(sales_region == "na_sales") %>%
summarise(Total = sum(sales_region_millions, na.rm = TRUE))
games_sales_full_bind_total_longer %>%
group_by(sales_region, year_data) %>%
summarise(Total = sum(sales_region_millions, na.rm = TRUE))
`summarise()` regrouping output by 'sales_region' (override with `.groups` argument)
write.csv(games_sales_full_bind_total_longer, "games_sales_full_bind_total_longer.csv")
I also need to amend pal_sales to eu_sales
games_sales_full <- left_join(sales_clean_2019_year, sales_clean_2016_year)
Joining, by = c("name", "genre", "platform", "publisher", "developer", "critic_score")
games_sales_full
dim(games_sales_full)
[1] 55792 23
view(games_sales_full)
summary(games_sales_full)
rank name genre esrb_rating platform publisher developer critic_score user_score total_shipped_2019 global_sales_2019
Min. : 1 Length:55792 Length:55792 Length:55792 Length:55792 Length:55792 Length:55792 Min. : 1.00 Min. : 2.00 Min. : 0.03 Min. : 0.00
1st Qu.:13949 Class :character Class :character Class :character Class :character Class :character Class :character 1st Qu.: 6.40 1st Qu.: 7.80 1st Qu.: 0.20 1st Qu.: 0.03
Median :27896 Mode :character Mode :character Mode :character Mode :character Mode :character Mode :character Median : 7.50 Median : 8.50 Median : 0.59 Median : 0.12
Mean :27896 Mean : 7.21 Mean : 8.25 Mean : 1.89 Mean : 0.37
3rd Qu.:41844 3rd Qu.: 8.30 3rd Qu.: 9.10 3rd Qu.: 1.80 3rd Qu.: 0.36
Max. :55792 Max. :10.00 Max. :10.00 Max. :82.86 Max. :20.32
NA's :49256 NA's :55457 NA's :53965 NA's :36377
na_sales_2019 eu_sales_2019 jp_sales_2019 other_sales_2019 year last_update vgchartzscore na_sales_2016 eu_sales_2016 jp_sales_2016 other_sales_2016 global_sales_2016
Min. :0.00 Min. :0.00 Min. :0.00 Min. :0.00 Min. :1970 Length:55792 Min. :2.60 Min. :0.00 Min. :0.00 Min. :0.00 Min. :0.00 Min. :0.01
1st Qu.:0.05 1st Qu.:0.01 1st Qu.:0.02 1st Qu.:0.00 1st Qu.:2000 Class :character 1st Qu.:6.80 1st Qu.:0.03 1st Qu.:0.00 1st Qu.:0.00 1st Qu.:0.00 1st Qu.:0.06
Median :0.12 Median :0.04 Median :0.05 Median :0.01 Median :2008 Mode :character Median :7.80 Median :0.09 Median :0.01 Median :0.00 Median :0.01 Median :0.13
Mean :0.28 Mean :0.16 Mean :0.11 Mean :0.04 Mean :2006 Mean :7.43 Mean :0.14 Mean :0.06 Mean :0.01 Mean :0.02 Mean :0.23
3rd Qu.:0.29 3rd Qu.:0.14 3rd Qu.:0.12 3rd Qu.:0.04 3rd Qu.:2011 3rd Qu.:8.50 3rd Qu.:0.18 3rd Qu.:0.05 3rd Qu.:0.00 3rd Qu.:0.02 3rd Qu.:0.27
Max. :9.76 Max. :9.85 Max. :2.69 Max. :3.12 Max. :2020 Max. :9.60 Max. :1.22 Max. :1.23 Max. :0.81 Max. :0.25 Max. :2.15
NA's :42828 NA's :42603 NA's :48749 NA's :40270 NA's :979 NA's :54993 NA's :55238 NA's :55238 NA's :55238 NA's :55238 NA's :55238
Replace NA with 0
games_sales_full_na <- games_sales_full %>%
mutate(total_shipped_2019 = replace(total_shipped_2019,
is.na(total_shipped_2019), 0
)) %>%
mutate(global_sales_2019 = replace(global_sales_2019,
is.na(global_sales_2019), 0
)) %>%
mutate(global_sales_2016 = replace(global_sales_2016,
is.na(global_sales_2016), 0
))
games_sales_full_na
games_sales_full_total <- mutate(games_sales_full_na, total_global_sales = total_shipped_2019 + global_sales_2019 + global_sales_2016)
games_sales_full_total
games_sales_full_total %>%
arrange(games_sales_full_total)
games_sales_full_total
view(games_sales_full_total)
write.csv(games_sales_full_longer, "games_sales_full_longer.csv")